package org.jzoie.util;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Splits text into space-separated tokens: digit runs, ASCII-letter words and
 * Chinese n-grams / single characters.
 *
 * <p>The three public static flags are global: changing one affects every
 * instance. Each analyze call replaces the token list returned by
 * {@link #getList()}, and nothing in this class is synchronized.
 */
public class TextAnalyzeSpaceUtil 
{
	/** Runs of ASCII letters. */
	private static final Pattern WORD_PATTERN=Pattern.compile("[a-zA-Z]+");
	/** Runs of ASCII digits. */
	private static final Pattern NUMBER_PATTERN=Pattern.compile("[0-9]+");
	/** Runs of characters in the range U+4E00..U+9FA5. */
	private static final Pattern CHINESE_PATTERN=Pattern.compile("[\u4E00-\u9FA5]+");
	/** Any single character outside the range U+4E00..U+9FA5. */
	private static final Pattern NON_CHINESE_PATTERN=Pattern.compile("[^\u4E00-\u9FA5]");
	/**
	 * Characters stripped by {@link #spaceToOrderAppend(String)}: U+FEFF, the
	 * ASCII space, and ASCII / full-width punctuation. The square brackets are
	 * escaped with backslashes; the earlier form used forward slashes, which the
	 * regex engine read as a nested character class, so '[' and ']' were never
	 * removed.
	 */
	private static final Pattern FILTER_PATTERN=Pattern.compile(
			"[\uFEFF `~!@#$%^&*()+=|{}':;',\\[\\].<>/?~！@#￥%……&*（）——+|{}【】‘；：”“’。，、？]");

	/** Tokens produced by the most recent analyze call; null until one has run. */
	private List<String> list=null;
	/** Longest Chinese n-gram emitted by {@link #spaceAnalyze(String)}. */
	private int maxChineseLength=10;
	/** When true, each digit run is expanded into all of its substrings. */
	public static boolean SEP_NUMBER=false;
	/** When true, each word is expanded into all of its substrings. */
	public static boolean SEP_WORD=false;
	/** When true, {@link #innerText(String)} strips HTML markup. */
	public static boolean isFilteHtml=true;

	/**
	 * Returns the visible text of an HTML fragment.
	 *
	 * @param html markup to strip; may be null
	 * @return {@code html} unchanged when {@link #isFilteHtml} is false or
	 *         {@code html} is null, otherwise the text of the Jsoup-parsed document
	 */
	public String innerText(String html)
	{
		if (!isFilteHtml || html == null)
		{
			return html;
		}
		Document doc=Jsoup.parse(html);
		return doc.text();
	}

	/**
	 * Removes every match of a regular expression and trims the result.
	 *
	 * @param str text to filter
	 * @param reg regular expression whose matches are deleted
	 * @return {@code str} without the matches, trimmed
	 */
	public String regFilte(String str,String reg)
	{
		return Pattern.compile(reg).matcher(str).replaceAll("").trim();
	}

	/**
	 * Keeps only the characters in the range U+4E00..U+9FA5.
	 *
	 * @param string text to filter
	 * @return the retained characters in their original order
	 */
	public String remainChinase(String string)
	{
		return NON_CHINESE_PATTERN.matcher(string).replaceAll("");
	}

	/**
	 * Extracts the distinct ASCII-letter words, lower-cased, in order of first
	 * appearance.
	 *
	 * @param string text to scan; null yields an empty list
	 * @return a new mutable list of distinct lower-case words
	 */
	public List<String> getWords(String string)
	{
		return uniqueMatches(WORD_PATTERN, string, true);
	}

	/**
	 * Extracts the distinct digit runs in order of first appearance.
	 *
	 * @param string text to scan; null yields an empty list
	 * @return a new mutable list of distinct digit runs
	 */
	public List<String> getNumbers(String string)
	{
		return uniqueMatches(NUMBER_PATTERN, string, false);
	}

	/**
	 * Extracts the distinct runs of Chinese characters in order of first
	 * appearance.
	 *
	 * @param string text to scan; null yields an empty list
	 * @return a new mutable list of distinct Chinese runs
	 */
	public List<String> getChinese(String string)
	{
		return uniqueMatches(CHINESE_PATTERN, string, false);
	}

	/**
	 * Concatenates every Chinese character of the input, dropping everything else.
	 *
	 * @param string text to scan; null yields an empty string
	 * @return the Chinese characters in their original order
	 */
	public String getChineseString(String string)
	{
		if (string == null)
		{
			return "";
		}
		// Joining all Chinese runs is the same as deleting every non-Chinese character.
		return NON_CHINESE_PATTERN.matcher(string).replaceAll("");
	}

	/**
	 * Appends the distinct substrings of length {@code n} of the trimmed input
	 * to {@code list}, skipping values the list already holds.
	 *
	 * @param string text to slice; null adds nothing
	 * @param n      substring length; values below 1 add nothing
	 * @param list   list to extend, or null to start a new one
	 * @return {@code list}, or the newly created list when {@code list} was null
	 */
	public List<String> seperateN(String string,int n,List<String> list)
	{
		if (list == null)
		{
			list=new ArrayList<String>();
		}
		if (string == null || n < 1)
		{
			return list;
		}
		String text=string.trim();
		// Hash lookup replaces a linear List.contains scan per candidate.
		Set<String> seen=new HashSet<String>(list);
		int lastStart=text.length()-n;
		for (int i = 0; i <= lastStart; i++)
		{
			String gram=text.substring(i, i+n);
			if (seen.add(gram))
			{
				list.add(gram);
			}
		}
		return list;
	}

	/**
	 * Tokenizes the input into distinct numbers, words and Chinese n-grams (in
	 * that order) and joins them with a space after each token.
	 *
	 * <p>NOTE(review): Chinese runs are taken from the raw input, before
	 * {@link #innerText(String)} is applied, so Chinese text inside markup is
	 * tokenized too. {@link #spaceAnalyzeAppend(String)} extracts Chinese after
	 * stripping. The original order is kept here; confirm whether the
	 * difference is intended.
	 *
	 * @param string text or HTML to analyze; null yields an empty string
	 * @return the tokens, each followed by one space
	 */
	public String spaceAnalyze(String string)
	{
		Set<String> tokens=new LinkedHashSet<String>();
		if (string != null)
		{
			// Chinese runs come from the raw input (see the note above).
			List<String> chineseList=getChinese(string);
			collectNumbersAndWords(innerText(string), tokens);
			for (String run : chineseList)
			{
				int length=run.length();
				if (length > 1)
				{
					// Emit 1-grams up to maxChineseLength-grams, capped by the run length.
					int end=Math.min(length, maxChineseLength);
					for (int n = 1; n <= end; n++)
					{
						addNGrams(run, n, tokens);
					}
				} else
				{
					tokens.add(run);
				}
			}
		}
		list=new LinkedList<String>(tokens);
		return joinWithTrailingSpace(list);
	}

	/**
	 * Tokenizes the input into distinct numbers and words, then appends every
	 * Chinese character of the stripped text individually (repeats included).
	 * Only the numbers and words are stored in the list returned by
	 * {@link #getList()}.
	 *
	 * @param string text or HTML to analyze; null yields an empty string
	 * @return the tokens, each followed by one space
	 */
	public String spaceAnalyzeAppend(String string)
	{
		Set<String> tokens=new LinkedHashSet<String>();
		String text=(string == null) ? null : innerText(string);
		if (text != null)
		{
			collectNumbersAndWords(text, tokens);
		}
		list=new LinkedList<String>(tokens);
		StringBuilder buffer=new StringBuilder(joinWithTrailingSpace(list));
		String chineseString=getChineseString(text);
		for (int i = 0; i < chineseString.length(); i++)
		{
			buffer.append(chineseString.charAt(i)).append(' ');
		}
		return buffer.toString();
	}

	/**
	 * Strips markup and punctuation, lower-cases the text and puts a space
	 * after every remaining character.
	 *
	 * @param strs text or HTML to process; null yields an empty string
	 * @return the remaining characters, each followed by one space
	 */
	public String spaceToOrderAppend(String strs)
	{
		if (strs == null)
		{
			return "";
		}
		String text=innerText(strs);
		text=FILTER_PATTERN.matcher(text).replaceAll("").trim();
		// Drop the ideographic space (U+3000), which the filter does not cover.
		text=text.replace("\u3000", "");
		text=text.toLowerCase(Locale.ROOT);
		StringBuilder buffer=new StringBuilder(text.length()*2);
		// Walk by code point so a surrogate pair is never split by the separator.
		for (int i = 0; i < text.length(); )
		{
			int codePoint=text.codePointAt(i);
			i+=Character.charCount(codePoint);
			if (codePoint != ' ')
			{
				buffer.appendCodePoint(codePoint).append(' ');
			}
		}
		return buffer.toString();
	}

	/**
	 * Returns the internal token list of the most recent
	 * {@link #spaceAnalyze(String)} or {@link #spaceAnalyzeAppend(String)} call.
	 *
	 * @return the list itself (not a copy), or null if neither method has run
	 */
	public List<String> getList() {
		return list;
	}

	/**
	 * @return the longest Chinese n-gram length used by {@link #spaceAnalyze(String)}
	 */
	public int getMaxChineseLength() {
		return maxChineseLength;
	}

	/**
	 * @param maxChineseLength longest Chinese n-gram length for
	 *        {@link #spaceAnalyze(String)}
	 */
	public void setMaxChineseLength(int maxChineseLength) {
		this.maxChineseLength = maxChineseLength;
	}

	/** Collects the distinct matches of a pattern in order of first appearance. */
	private static List<String> uniqueMatches(Pattern pattern,String text,boolean lowerCase)
	{
		Set<String> unique=new LinkedHashSet<String>();
		if (text != null)
		{
			Matcher m=pattern.matcher(text);
			while (m.find())
			{
				String token=m.group();
				// Locale.ROOT keeps ASCII case folding independent of the default locale.
				unique.add(lowerCase ? token.toLowerCase(Locale.ROOT) : token);
			}
		}
		return new ArrayList<String>(unique);
	}

	/** Adds the number tokens, then the word tokens, of the text to the sink. */
	private void collectNumbersAndWords(String text,Set<String> sink)
	{
		for (String number : getNumbers(text))
		{
			if (SEP_NUMBER)
			{
				addAllNGrams(number, sink);
			} else
			{
				sink.add(number);
			}
		}
		for (String word : getWords(text))
		{
			String lower=word.toLowerCase(Locale.ROOT).trim();
			if (SEP_WORD)
			{
				addAllNGrams(lower, sink);
			} else
			{
				sink.add(lower);
			}
		}
	}

	/** Adds every substring of the text, shortest lengths first, to the sink. */
	private static void addAllNGrams(String text,Set<String> sink)
	{
		for (int n = 1; n <= text.length(); n++)
		{
			addNGrams(text, n, sink);
		}
	}

	/** Adds every length-n substring of the trimmed text to the sink. */
	private static void addNGrams(String text,int n,Set<String> sink)
	{
		String trimmed=text.trim();
		int lastStart=trimmed.length()-n;
		for (int i = 0; i <= lastStart; i++)
		{
			sink.add(trimmed.substring(i, i+n));
		}
	}

	/** Concatenates the tokens, writing one space after each. */
	private static String joinWithTrailingSpace(List<String> tokens)
	{
		StringBuilder buffer=new StringBuilder();
		for (String token : tokens)
		{
			buffer.append(token).append(' ');
		}
		return buffer.toString();
	}
}
